import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# imports
%run function_proposed_gcn.py
# Load the pickled training frame.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# 'fraudTrain.pkl' from a trusted location.
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)

# Keep every 10th row and renumber the index from 0.
df = fraudTrain[::10]
df = df.reset_index(drop=True)

# Fraction of fraudulent rows in the subsample (displayed by the notebook).
df.is_fraud.mean()

# Random train/test split with scikit-learn's default proportions.
df_train, df_test = sklearn.model_selection.train_test_split(df)
def edge_index(df, unique_col, theta, gamma):
    """Build a thresholded edge list linking rows that share a key.

    Rows of ``df`` are grouped by ``unique_col``.  Every entry returned by
    ``compute_time_difference`` for a group is a ``[source, target, dt]``
    triple; each triple gets the weight ``exp(-dt / theta)``.  Triples whose
    weight is exactly 1 are reset to weight 0, and only triples whose weight
    is strictly greater than ``gamma`` are kept.

    Args:
        df: DataFrame containing ``unique_col`` plus whatever
            ``compute_time_difference`` reads from each group.
        unique_col: Column label (or labels) handed to ``DataFrame.groupby``.
        theta: Scale of the exponential decay applied to ``dt``.
        gamma: Weight threshold; an edge survives when its weight exceeds it.

    Returns:
        ``torch.long`` tensor of shape ``(2, n_edges)``; each column is one
        ``(source, target)`` pair.  The shape is ``(2, 0)`` when no pair
        survives, so the result stays a valid edge list.
    """
    triples = [
        item
        for _, group in df.groupby(unique_col)
        for item in compute_time_difference(group)
    ]
    # reshape keeps the array two-dimensional even when there are no triples,
    # so the column indexing below cannot fail on empty input.
    pairs = np.asarray(triples, dtype=np.float64).reshape(-1, 3)

    weights = np.exp(-pairs[:, 2] / theta)
    # A weight of exactly 1 means dt == 0; such pairs are given weight 0.
    weights[weights == 1] = 0.0

    kept = pairs[weights > gamma, :2].astype(np.int64)
    # Transposing the (n_edges, 2) array yields (2, n_edges), including (2, 0).
    return torch.tensor(kept.T, dtype=torch.long)
def gcn_data(df):
    """Wrap ``df`` in a ``torch_geometric.data.Data`` object.

    Node features are the ``amt`` column as a float tensor reshaped to one
    column; labels are the ``is_fraud`` column as an int64 tensor.

    NOTE(review): the edge list and the masks are not parameters.  They are
    read at call time from the module-level names ``edge_index`` and ``mask``
    (``mask[0]`` is used as the train mask, ``mask[1]`` as the test mask), so
    both must be bound before calling, and ``edge_index`` must refer to the
    edge tensor rather than to the function of the same name.

    Args:
        df: DataFrame with ``amt`` and ``is_fraud`` columns.

    Returns:
        The assembled ``torch_geometric.data.Data`` instance.
    """
    x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1, 1)
    y = torch.tensor(df['is_fraud'].values, dtype=torch.int64)
    return torch_geometric.data.Data(
        x=x,
        edge_index=edge_index,
        y=y,
        train_mask=mask[0],
        test_mask=mask[1],
    )
def evaluation(y, yhat):
    """Score predictions with five scikit-learn classification metrics.

    Args:
        y: True labels.
        yhat: Predicted values, passed unchanged as the second argument of
            every metric (including ``roc_auc_score``).

    Returns:
        One-row ``pandas.DataFrame`` with one column per metric, named after
        the metric function (``accuracy_score``, ``precision_score``,
        ``recall_score``, ``f1_score``, ``roc_auc_score``), each rounded to
        six decimals.
    """
    metrics = [
        sklearn.metrics.accuracy_score,
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
        sklearn.metrics.roc_auc_score,
    ]
    # np.round accepts both NumPy scalars and plain Python floats, whereas the
    # ``.round`` method exists only on NumPy scalars.
    return pd.DataFrame(
        {m.__name__: [np.round(m(y, yhat), 6)] for m in metrics}
    )
def compute_time_difference(group):
    """Return the absolute time gap between every ordered pair of rows.

    Args:
        group: DataFrame with a ``trans_date_trans_time`` column whose values
            support subtraction yielding an object with ``total_seconds()``.

    Returns:
        List of ``[label_i, label_j, seconds]`` entries, one per ordered pair
        of rows (a row paired with itself included), ordered by ``i`` then
        ``j``.  Labels are the index labels of ``group``; ``seconds`` is the
        absolute difference of the two timestamps.  Empty input gives ``[]``.
    """
    # Pull labels and timestamps out once; looking rows up with ``iloc``
    # inside the double loop builds a new row Series for every access.
    nodes = list(
        zip(group.index.tolist(), group["trans_date_trans_time"].tolist())
    )
    return [
        [src, dst, abs((t_src - t_dst).total_seconds())]
        for src, t_src in nodes
        for dst, t_dst in nodes
    ]
# Pairwise time gaps between rows that share the same cc_num.
groups = df.groupby('cc_num')
edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
edge_index = edge_index.astype(np.float64)

# Save the raw [source, target, seconds] table.
filename = "edge_index_01.npy"
np.save(filename, edge_index)

# Mean of the third column (the time gaps), displayed by the notebook.
edge_index[:,2].mean()
11661268.505973412
# NOTE(review): `theta`, `unique_col` and `gamma` are not assigned at top
# level anywhere in the visible notebook; these cells repeat the body of the
# edge_index function and need those names bound first.

# Replace the time gaps by exp(-gap / theta); weights equal to 1 become 0.
edge_index[:,2] = (np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist()

groups = df.groupby(unique_col)
edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
edge_index = edge_index.astype(np.float64)
# filename = f"edge_index{str(unique_col).replace(' ', '').replace('_', '')}.npy" # save
# np.save(filename, edge_index)
edge_index[:,2] = (np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist()
# Keep only the pairs whose weight exceeds gamma, as a long tensor.
edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
# NOTE(review): this rebinds the name `edge_index` from the function to its
# result, so the function must be re-defined before this line can run again.
edge_index = edge_index(df,'cc_num', 8.028000e+04, 0.3)
data = gcn_data(df)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# True labels of the test rows.
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results1 = evaluation(yy,yyhat)
NameError: name 'df3' is not defined
8.028000e+04
80280.0
# NOTE(review): with the signature printed by `try_2??` further down
# (df, fraud_rate, test_fraud_rate, theta, gamma, prev_results=None), these
# positional arguments bind theta=0.005836, gamma=0.8 and prev_results=0.3;
# confirm that this is the intended mapping.
df_results = try_2(fraudTrain, 10, 0.0058, 0.005836, 0.8, 0.3)
IndexError: index 0 is out of bounds for dimension 0 with size 0
df_results
try_2??
Signature: try_2(df, fraud_rate, test_fraud_rate, theta, gamma, prev_results=None) Docstring: <no docstring> Source: def try_2(df, fraud_rate, test_fraud_rate, theta, gamma, prev_results=None): if prev_results is None: df_results = pd.DataFrame(columns=[ 'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based', 'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate', 'test_size', 'test_frate', 'hyper_params', 'theta', 'gamma' ]) else: df_results = prev_results.copy() df = df[::10] df = df.reset_index() df_tr, df_tst = sklearn.model_selection.train_test_split(df) df = pd.concat([df_tr, df_tst]) train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False))) # index꼬이는거 방지하기 위해서? ★ (이거,, 훔,,?( test_mask = np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True))) mask = (train_mask, test_mask) groups = df.groupby('cc_num') edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist]) edge_index = edge_index.astype(np.float64) edge_index[:,2] = (np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist() edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t() x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1,1) y = torch.tensor(df['is_fraud'].values,dtype=torch.int64) data = torch_geometric.data.Data(x=x, edge_index = edge_index, y=y, train_mask = mask[0], test_mask= mask[1]) model = GCN1() optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) yy = (data.y[data.test_mask]).numpy() yyhat, yyhat_ = train_and_evaluate_model(data, model, optimizer) yyhat_ = yyhat_.detach().numpy() eval = evaluation(yy, yyhat, yyhat_) result = { 'model': 'GCN', 'time': None, 'acc': eval['acc'], 'pre': eval['pre'], 'rec': eval['rec'], 'f1': eval['f1'], 'auc': eval['auc'], 'graph_based': True, 'method': 'Proposed', 'throw_rate': fraud_rate, 'train_size': 
len(df_tr), 'train_cols': 'amt', 'train_frate': df_tr.is_fraud.mean(), 'test_size': len(df_tst), 'test_frate': test_fraud_rate, 'hyper_params': None, 'theta': theta, 'gamma': gamma } df_results = df_results.append(result, ignore_index=True) return df_results File: ~/Dropbox/GNNpaper/posts/function_proposed_gcn.py Type: function